This document explores a data set which contains 113,937 loans with 81 variables on each loan, including loan amount, borrower rate (or interest rate), current loan status, borrower income, and many others.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
sb.set_style("darkgrid")
%matplotlib inline
# load the dataset into a pandas dataframe
loan_original = pd.read_csv('prosperLoanData.csv')
# setting the display options for the dataframes
pd.set_option('display.max_columns',500)
pd.set_option('max_colwidth',800)
# high-level overview of data shape and composition
print(loan_original.shape)
(113937, 81)
loan_original.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 81 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingKey 113937 non-null object 1 ListingNumber 113937 non-null int64 2 ListingCreationDate 113937 non-null object 3 CreditGrade 28953 non-null object 4 Term 113937 non-null int64 5 LoanStatus 113937 non-null object 6 ClosedDate 55089 non-null object 7 BorrowerAPR 113912 non-null float64 8 BorrowerRate 113937 non-null float64 9 LenderYield 113937 non-null float64 10 EstimatedEffectiveYield 84853 non-null float64 11 EstimatedLoss 84853 non-null float64 12 EstimatedReturn 84853 non-null float64 13 ProsperRating (numeric) 84853 non-null float64 14 ProsperRating (Alpha) 84853 non-null object 15 ProsperScore 84853 non-null float64 16 ListingCategory (numeric) 113937 non-null int64 17 BorrowerState 108422 non-null object 18 Occupation 110349 non-null object 19 EmploymentStatus 111682 non-null object 20 EmploymentStatusDuration 106312 non-null float64 21 IsBorrowerHomeowner 113937 non-null bool 22 CurrentlyInGroup 113937 non-null bool 23 GroupKey 13341 non-null object 24 DateCreditPulled 113937 non-null object 25 CreditScoreRangeLower 113346 non-null float64 26 CreditScoreRangeUpper 113346 non-null float64 27 FirstRecordedCreditLine 113240 non-null object 28 CurrentCreditLines 106333 non-null float64 29 OpenCreditLines 106333 non-null float64 30 TotalCreditLinespast7years 113240 non-null float64 31 OpenRevolvingAccounts 113937 non-null int64 32 OpenRevolvingMonthlyPayment 113937 non-null float64 33 InquiriesLast6Months 113240 non-null float64 34 TotalInquiries 112778 non-null float64 35 CurrentDelinquencies 113240 non-null float64 36 AmountDelinquent 106315 non-null float64 37 DelinquenciesLast7Years 112947 non-null float64 38 PublicRecordsLast10Years 113240 non-null float64 39 PublicRecordsLast12Months 106333 non-null float64 40 RevolvingCreditBalance 106333 non-null float64 41 BankcardUtilization 106333 non-null float64 42 AvailableBankcardCredit 106393 non-null float64 43 TotalTrades 106393 non-null float64 44 TradesNeverDelinquent (percentage) 106393 non-null float64 45 TradesOpenedLast6Months 106393 non-null float64 46 DebtToIncomeRatio 105383 non-null float64 47 IncomeRange 113937 non-null object 48 IncomeVerifiable 113937 non-null bool 49 StatedMonthlyIncome 113937 non-null float64 50 LoanKey 113937 non-null object 51 TotalProsperLoans 22085 non-null float64 52 TotalProsperPaymentsBilled 22085 non-null float64 53 OnTimeProsperPayments 22085 non-null float64 54 ProsperPaymentsLessThanOneMonthLate 22085 non-null float64 55 ProsperPaymentsOneMonthPlusLate 22085 non-null float64 56 ProsperPrincipalBorrowed 22085 non-null float64 57 ProsperPrincipalOutstanding 22085 non-null float64 58 ScorexChangeAtTimeOfListing 18928 non-null float64 59 LoanCurrentDaysDelinquent 113937 non-null int64 60 LoanFirstDefaultedCycleNumber 16952 non-null float64 61 LoanMonthsSinceOrigination 113937 non-null int64 62 LoanNumber 113937 non-null int64 63 LoanOriginalAmount 113937 non-null int64 64 LoanOriginationDate 113937 non-null object 65 LoanOriginationQuarter 113937 non-null object 66 MemberKey 113937 non-null object 67 MonthlyLoanPayment 113937 non-null float64 68 LP_CustomerPayments 113937 non-null float64 69 LP_CustomerPrincipalPayments 113937 non-null float64 70 LP_InterestandFees 113937 non-null float64 71 LP_ServiceFees 113937 non-null float64 72 LP_CollectionFees 113937 non-null float64 73 LP_GrossPrincipalLoss 113937 non-null float64 74 LP_NetPrincipalLoss 113937 non-null float64 75 LP_NonPrincipalRecoverypayments 113937 non-null float64 76 PercentFunded 113937 non-null float64 77 Recommendations 113937 non-null int64 78 InvestmentFromFriendsCount 113937 non-null int64 79 InvestmentFromFriendsAmount 113937 non-null float64 80 Investors 113937 non-null int64 dtypes: bool(3), float64(50), int64(11), object(17) memory usage: 68.1+ MB
loan_original.head(10)
| ListingKey | ListingNumber | ListingCreationDate | CreditGrade | Term | LoanStatus | ClosedDate | BorrowerAPR | BorrowerRate | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ProsperRating (numeric) | ProsperRating (Alpha) | ProsperScore | ListingCategory (numeric) | BorrowerState | Occupation | EmploymentStatus | EmploymentStatusDuration | IsBorrowerHomeowner | CurrentlyInGroup | GroupKey | DateCreditPulled | CreditScoreRangeLower | CreditScoreRangeUpper | FirstRecordedCreditLine | CurrentCreditLines | OpenCreditLines | TotalCreditLinespast7years | OpenRevolvingAccounts | OpenRevolvingMonthlyPayment | InquiriesLast6Months | TotalInquiries | CurrentDelinquencies | AmountDelinquent | DelinquenciesLast7Years | PublicRecordsLast10Years | PublicRecordsLast12Months | RevolvingCreditBalance | BankcardUtilization | AvailableBankcardCredit | TotalTrades | TradesNeverDelinquent (percentage) | TradesOpenedLast6Months | DebtToIncomeRatio | IncomeRange | IncomeVerifiable | StatedMonthlyIncome | LoanKey | TotalProsperLoans | TotalProsperPaymentsBilled | OnTimeProsperPayments | ProsperPaymentsLessThanOneMonthLate | ProsperPaymentsOneMonthPlusLate | ProsperPrincipalBorrowed | ProsperPrincipalOutstanding | ScorexChangeAtTimeOfListing | LoanCurrentDaysDelinquent | LoanFirstDefaultedCycleNumber | LoanMonthsSinceOrigination | LoanNumber | LoanOriginalAmount | LoanOriginationDate | LoanOriginationQuarter | MemberKey | MonthlyLoanPayment | LP_CustomerPayments | LP_CustomerPrincipalPayments | LP_InterestandFees | LP_ServiceFees | LP_CollectionFees | LP_GrossPrincipalLoss | LP_NetPrincipalLoss | LP_NonPrincipalRecoverypayments | PercentFunded | Recommendations | InvestmentFromFriendsCount | InvestmentFromFriendsAmount | Investors | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1021339766868145413AB3B | 193129 | 2007-08-26 19:09:29.263000000 | C | 36 | Completed | 2009-08-14 00:00:00 | 0.16516 | 0.1580 | 0.1380 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | CO | Other | Self-employed | 2.0 | True | True | NaN | 2007-08-26 18:41:46.780000000 | 640.0 | 659.0 | 2001-10-11 00:00:00 | 5.0 | 4.0 | 12.0 | 1 | 24.0 | 3.0 | 3.0 | 2.0 | 472.0 | 4.0 | 0.0 | 0.0 | 0.0 | 0.00 | 1500.0 | 11.0 | 0.81 | 0.0 | 0.17 | $25,000-49,999 | True | 3083.333333 | E33A3400205839220442E84 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 78 | 19141 | 9425 | 2007-09-12 00:00:00 | Q3 2007 | 1F3E3376408759268057EDA | 330.43 | 11396.14 | 9425.00 | 1971.14 | -133.18 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 258 |
| 1 | 10273602499503308B223C1 | 1209647 | 2014-02-27 08:28:07.900000000 | NaN | 36 | Current | NaN | 0.12016 | 0.0920 | 0.0820 | 0.07960 | 0.0249 | 0.05470 | 6.0 | A | 7.0 | 2 | CO | Professional | Employed | 44.0 | False | False | NaN | 2014-02-27 08:28:14 | 680.0 | 699.0 | 1996-03-18 00:00:00 | 14.0 | 14.0 | 29.0 | 13 | 389.0 | 3.0 | 5.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 3989.0 | 0.21 | 10266.0 | 29.0 | 1.00 | 2.0 | 0.18 | $50,000-74,999 | True | 6125.000000 | 9E3B37071505919926B1D82 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 0 | 134815 | 10000 | 2014-03-03 00:00:00 | Q1 2014 | 1D13370546739025387B2F4 | 318.93 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 2 | 0EE9337825851032864889A | 81716 | 2007-01-05 15:00:47.090000000 | HR | 36 | Completed | 2009-12-17 00:00:00 | 0.28269 | 0.2750 | 0.2400 | NaN | NaN | NaN | NaN | NaN | NaN | 0 | GA | Other | Not available | NaN | False | True | 783C3371218786870A73D20 | 2007-01-02 14:09:10.060000000 | 480.0 | 499.0 | 2002-07-27 00:00:00 | NaN | NaN | 3.0 | 0 | 0.0 | 0.0 | 1.0 | 1.0 | NaN | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.06 | Not displayed | True | 2083.333333 | 6954337960046817851BCB2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 86 | 6466 | 3001 | 2007-01-17 00:00:00 | Q1 2007 | 5F7033715035555618FA612 | 123.32 | 4186.63 | 3001.00 | 1185.63 | -24.20 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 41 |
| 3 | 0EF5356002482715299901A | 658116 | 2012-10-22 11:02:35.010000000 | NaN | 36 | Current | NaN | 0.12528 | 0.0974 | 0.0874 | 0.08490 | 0.0249 | 0.06000 | 6.0 | A | 9.0 | 16 | GA | Skilled Labor | Employed | 113.0 | True | False | NaN | 2012-10-22 11:02:32 | 800.0 | 819.0 | 1983-02-28 00:00:00 | 5.0 | 5.0 | 29.0 | 7 | 115.0 | 0.0 | 1.0 | 4.0 | 10056.0 | 14.0 | 0.0 | 0.0 | 1444.0 | 0.04 | 30754.0 | 26.0 | 0.76 | 0.0 | 0.15 | $25,000-49,999 | True | 2875.000000 | A0393664465886295619C51 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 16 | 77296 | 10000 | 2012-11-01 00:00:00 | Q4 2012 | 9ADE356069835475068C6D2 | 321.45 | 5143.20 | 4091.09 | 1052.11 | -108.01 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 158 |
| 4 | 0F023589499656230C5E3E2 | 909464 | 2013-09-14 18:38:39.097000000 | NaN | 36 | Current | NaN | 0.24614 | 0.2085 | 0.1985 | 0.18316 | 0.0925 | 0.09066 | 3.0 | D | 4.0 | 2 | MN | Executive | Employed | 44.0 | True | False | NaN | 2013-09-14 18:38:44 | 680.0 | 699.0 | 2004-02-20 00:00:00 | 19.0 | 19.0 | 49.0 | 6 | 220.0 | 1.0 | 9.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 6193.0 | 0.81 | 695.0 | 39.0 | 0.95 | 2.0 | 0.26 | $100,000+ | True | 9583.333333 | A180369302188889200689E | 1.0 | 11.0 | 11.0 | 0.0 | 0.0 | 11000.0 | 9947.9 | NaN | 0 | NaN | 6 | 102670 | 15000 | 2013-09-20 00:00:00 | Q3 2013 | 36CE356043264555721F06C | 563.97 | 2819.85 | 1563.22 | 1256.63 | -60.27 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 20 |
| 5 | 0F05359734824199381F61D | 1074836 | 2013-12-14 08:26:37.093000000 | NaN | 60 | Current | NaN | 0.15425 | 0.1314 | 0.1214 | 0.11567 | 0.0449 | 0.07077 | 5.0 | B | 10.0 | 1 | NM | Professional | Employed | 82.0 | True | False | NaN | 2013-12-14 08:26:40 | 740.0 | 759.0 | 1973-03-01 00:00:00 | 21.0 | 17.0 | 49.0 | 13 | 1410.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 62999.0 | 0.39 | 86509.0 | 47.0 | 1.00 | 0.0 | 0.36 | $100,000+ | True | 8333.333333 | C3D63702273952547E79520 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 3 | 123257 | 15000 | 2013-12-24 00:00:00 | Q4 2013 | 874A3701157341738DE458F | 342.37 | 679.34 | 351.89 | 327.45 | -25.33 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 6 | 0F0A3576754255009D63151 | 750899 | 2013-04-12 09:52:56.147000000 | NaN | 36 | Current | NaN | 0.31032 | 0.2712 | 0.2612 | 0.23820 | 0.1275 | 0.11070 | 2.0 | E | 2.0 | 1 | KS | Sales - Retail | Employed | 172.0 | False | False | NaN | 2013-04-12 09:52:53 | 680.0 | 699.0 | 2000-09-29 00:00:00 | 10.0 | 7.0 | 20.0 | 6 | 214.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 5812.0 | 0.72 | 1929.0 | 16.0 | 0.68 | 0.0 | 0.27 | $25,000-49,999 | True | 2083.333333 | CE963680102927767790520 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 11 | 88353 | 3000 | 2013-04-18 00:00:00 | Q2 2013 | AA4535764146102879D5959 | 122.67 | 1226.70 | 604.25 | 622.45 | -22.95 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 7 | 0F1035772717087366F9EA7 | 768193 | 2013-05-05 06:49:27.493000000 | NaN | 36 | Current | NaN | 0.23939 | 0.2019 | 0.1919 | 0.17830 | 0.0799 | 0.09840 | 4.0 | C | 4.0 | 2 | CA | Laborer | Employed | 103.0 | False | False | NaN | 2013-05-05 06:49:25 | 700.0 | 719.0 | 1999-02-25 00:00:00 | 6.0 | 6.0 | 10.0 | 5 | 101.0 | 3.0 | 16.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1260.0 | 0.13 | 2181.0 | 10.0 | 0.80 | 0.0 | 0.24 | $25,000-49,999 | True | 3355.750000 | 0C87368108902149313D53B | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 10 | 90051 | 10000 | 2013-05-13 00:00:00 | Q2 2013 | 737F347089545035681C074 | 372.60 | 3353.40 | 1955.89 | 1397.51 | -69.21 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 8 | 0F043596202561788EA13D5 | 1023355 | 2013-12-02 10:43:39.117000000 | NaN | 36 | Current | NaN | 0.07620 | 0.0629 | 0.0529 | 0.05221 | 0.0099 | 0.04231 | 7.0 | AA | 9.0 | 7 | IL | Food Service | Employed | 269.0 | True | False | NaN | 2013-12-02 10:43:39 | 820.0 | 839.0 | 1993-04-01 00:00:00 | 17.0 | 16.0 | 32.0 | 12 | 219.0 | 1.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9906.0 | 0.11 | 77696.0 | 29.0 | 1.00 | 1.0 | 0.25 | $25,000-49,999 | True | 3333.333333 | 02163700809231365A56A1C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 3 | 121268 | 10000 | 2013-12-12 00:00:00 | Q4 2013 | 49A53699682291323D04D66 | 305.54 | 611.08 | 505.58 | 105.50 | -16.77 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
| 9 | 0F043596202561788EA13D5 | 1023355 | 2013-12-02 10:43:39.117000000 | NaN | 36 | Current | NaN | 0.07620 | 0.0629 | 0.0529 | 0.05221 | 0.0099 | 0.04231 | 7.0 | AA | 11.0 | 7 | IL | Food Service | Employed | 269.0 | True | False | NaN | 2013-12-02 10:43:39 | 820.0 | 839.0 | 1993-04-01 00:00:00 | 17.0 | 16.0 | 32.0 | 12 | 219.0 | 1.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 9906.0 | 0.11 | 77696.0 | 29.0 | 1.00 | 1.0 | 0.25 | $25,000-49,999 | True | 3333.333333 | 02163700809231365A56A1C | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN | 3 | 121268 | 10000 | 2013-12-12 00:00:00 | Q4 2013 | 49A53699682291323D04D66 | 305.54 | 611.08 | 505.58 | 105.50 | -16.77 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0 | 0 | 0.0 | 1 |
loan_original['ListingCreationDate'].sort_values()
# ProsperScore A custom risk score built using historical Prosper data.
# The score ranges from 1-10, with 10 being the best, or lowest risk score.
# Applicable for loans originated after July 2009.
113921 2005-11-09 20:44:28.847000000
106735 2005-11-11 08:46:17.303000000
25166 2005-11-15 16:06:47.457000000
62297 2005-11-15 18:25:56.107000000
90598 2005-11-18 13:39:03.263000000
...
104205 2014-03-10 06:47:15.370000000
73267 2014-03-10 07:33:30.033000000
67741 2014-03-10 10:25:51.877000000
32376 2014-03-10 11:28:06.920000000
14131 2014-03-10 12:20:53.760000000
Name: ListingCreationDate, Length: 113937, dtype: object
duplicates = loan_original.duplicated().sum()
print('There are {} duplicate records in the dataset'.format(duplicates))
There are 0 duplicate records in the dataset
The dataset contains 113,937 rows and 81 columns. Each row serves as an individual loan entry and the columns describe features of the loan. The dataset contains information both interesting to borrowers and lenders.
In the interest of lenders, this analysis will aim to explore the nature of borrowers and loan categories to determine which loans result in the highest lender yields whilst holding the lowest risk. The exploration will dive into the different loan categories, borrower risk factors, loan yields and loan terms.
ListingCreationDate - The date the listing was created.
ProsperScore - A custom risk score built using historical Prosper data. The score ranges from 1-10, with 10 being the best, or lowest risk score. Applicable for loans originated after July 2009.
Term - The length of the loan expressed in months.
LoanStatus - The current status of the loan: Cancelled, Chargedoff, Completed, Current, Defaulted,
FinalPaymentInProgress, PastDue. The PastDue status will be accompanied by a delinquency bucket.
LenderYield - The Lender yield on the loan. Lender yield is equal to the interest rate on the loan less the servicing fee.
EstimatedEffectiveYield - Effective yield is equal to the borrower interest rate (i) minus the servicing fee rate, (ii) minus estimated uncollected interest on charge-offs, (iii) plus estimated collected late fees. Applicable for loans originated after July 2009.
EstimatedLoss - Estimated loss is the estimated principal loss on charge-offs. Applicable for loans originated after July 2009.
EstimatedReturn - The estimated return assigned to the listing at the time it was created. Estimated return is the difference between the Estimated Effective Yield and the Estimated Loss Rate. Applicable for loans originated after July 2009.
ListingCategory - The category of the listing that the borrower selected when posting their listing: 0 - Not Available, 1 - Debt Consolidation, 2 - Home Improvement, 3 - Business, 4 - Personal Loan, 5 - Student Use, 6 - Auto, 7- Other, 8 - Baby&Adoption, 9 - Boat, 10 - Cosmetic Procedure, 11 - Engagement Ring, 12 - Green Loans, 13 - Household Expenses, 14 - Large Purchases, 15 - Medical/Dental, 16 - Motorcycle, 17 - RV, 18 - Taxes, 19 - Vacation, 20 - Wedding Loans
EmploymentStatus - The employment status of the borrower at the time they posted the listing.
IncomeRange - The income range of the borrower at the time the listing was created.
DebtToIncomeRatio - The debt to income ratio of the borrower at the time the credit profile was pulled. This value is Null if the debt to income ratio is not available. This value is capped at 10.01 (any debt to income ratio larger than 1000% will be returned as 1001%).
LoanOriginalAmount - The origination amount of the loan.
CurrentDelinquencies - Number of accounts delinquent at the time the credit profile was pulled.
The next section will clean the following data wrangling issues observed after visual and programmatic assessment:
Copy of dataframe containing only necessary key features.
Incorrect datatypes for 'ListingCreationDate','LoanStatus', 'ListingCategory','EmploymentStatus','IncomeRange'
The numeric data in the 'ListingCategory (numeric)' can be converted to categorical data to provide description of reason for loan.
IncomeRange should be ordered as it is ordinal categorical data.
Dataset should be filtered to only include values post July 2009, as the Prosper Score feature will be heavily utilized in the dataset.
NaN values in the Income Range column can be replaced with $0 where employment status is not employed.
# selecting the key features columns to assign to new copy of dataframe
key_features = ['ListingCreationDate','ProsperScore','Term','LoanStatus',
'LenderYield','EstimatedEffectiveYield','EstimatedLoss','EstimatedReturn',
'ListingCategory (numeric)','EmploymentStatus','IncomeRange',
'DebtToIncomeRatio','LoanOriginalAmount','TradesNeverDelinquent (percentage)', 'CurrentCreditLines']
# copying the dataframe with selected key features columns
loan = loan_original[key_features].copy()
# testing
loan.head()
| ListingCreationDate | ProsperScore | Term | LoanStatus | LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | ListingCategory (numeric) | EmploymentStatus | IncomeRange | DebtToIncomeRatio | LoanOriginalAmount | TradesNeverDelinquent (percentage) | CurrentCreditLines | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2007-08-26 19:09:29.263000000 | NaN | 36 | Completed | 0.1380 | NaN | NaN | NaN | 0 | Self-employed | $25,000-49,999 | 0.17 | 9425 | 0.81 | 5.0 |
| 1 | 2014-02-27 08:28:07.900000000 | 7.0 | 36 | Current | 0.0820 | 0.07960 | 0.0249 | 0.05470 | 2 | Employed | $50,000-74,999 | 0.18 | 10000 | 1.00 | 14.0 |
| 2 | 2007-01-05 15:00:47.090000000 | NaN | 36 | Completed | 0.2400 | NaN | NaN | NaN | 0 | Not available | Not displayed | 0.06 | 3001 | NaN | NaN |
| 3 | 2012-10-22 11:02:35.010000000 | 9.0 | 36 | Current | 0.0874 | 0.08490 | 0.0249 | 0.06000 | 16 | Employed | $25,000-49,999 | 0.15 | 10000 | 0.76 | 5.0 |
| 4 | 2013-09-14 18:38:39.097000000 | 4.0 | 36 | Current | 0.1985 | 0.18316 | 0.0925 | 0.09066 | 2 | Employed | $100,000+ | 0.26 | 15000 | 0.95 | 19.0 |
loan[~loan['ProsperScore'].isna()]['ListingCreationDate'].min()
'2009-07-13 18:01:24.347000000'
loan['ListingCreationDate'].min(),loan['ListingCreationDate'].max()
('2005-11-09 20:44:28.847000000', '2014-03-10 12:20:53.760000000')
loan['ProsperScore'].value_counts()
4.0 12595 6.0 12278 8.0 12053 7.0 10597 5.0 9813 3.0 7642 9.0 6911 2.0 5766 10.0 4750 11.0 1456 1.0 992 Name: ProsperScore, dtype: int64
loan['ProsperScore'].isna().sum()
29084
loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingCreationDate 113937 non-null object 1 ProsperScore 84853 non-null float64 2 Term 113937 non-null int64 3 LoanStatus 113937 non-null object 4 LenderYield 113937 non-null float64 5 EstimatedEffectiveYield 84853 non-null float64 6 EstimatedLoss 84853 non-null float64 7 EstimatedReturn 84853 non-null float64 8 ListingCategory (numeric) 113937 non-null int64 9 EmploymentStatus 111682 non-null object 10 IncomeRange 113937 non-null object 11 DebtToIncomeRatio 105383 non-null float64 12 LoanOriginalAmount 113937 non-null int64 13 TradesNeverDelinquent (percentage) 106393 non-null float64 14 CurrentCreditLines 106333 non-null float64 dtypes: float64(8), int64(3), object(4) memory usage: 13.0+ MB
# ListingCreationDate to datetime
loan['ListingCreationDate'] = pd.to_datetime(loan['ListingCreationDate'])
# LoanStatus to category
loan['LoanStatus'] = loan['LoanStatus'].astype('category')
# ListingCategory (numeric) to category
loan['ListingCategory'] = loan['ListingCategory (numeric)'].astype('category')
# EmploymentStatus to category
loan['EmploymentStatus'] = loan['EmploymentStatus'].astype('category')
# IncomeRange to category
loan['IncomeRange'] = loan['IncomeRange'].astype('category')
# ProsperScore to category
loan['ProsperScore'] = loan['ProsperScore'].astype('category')
# Term to category
loan['Term'] = loan['Term'].astype('category')
# testing
loan.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 113937 entries, 0 to 113936 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingCreationDate 113937 non-null datetime64[ns] 1 ProsperScore 84853 non-null category 2 Term 113937 non-null category 3 LoanStatus 113937 non-null category 4 LenderYield 113937 non-null float64 5 EstimatedEffectiveYield 84853 non-null float64 6 EstimatedLoss 84853 non-null float64 7 EstimatedReturn 84853 non-null float64 8 ListingCategory (numeric) 113937 non-null int64 9 EmploymentStatus 111682 non-null category 10 IncomeRange 113937 non-null category 11 DebtToIncomeRatio 105383 non-null float64 12 LoanOriginalAmount 113937 non-null int64 13 TradesNeverDelinquent (percentage) 106393 non-null float64 14 CurrentCreditLines 106333 non-null float64 15 ListingCategory 113937 non-null category dtypes: category(6), datetime64[ns](1), float64(7), int64(2) memory usage: 9.3 MB
# creating a dictionary to replace numerical raiting with categorical descriptions
listing_categories = {0 : 'Not Available', 1 : 'Debt Consolidation', 2 : 'Home Improvement', 3: 'Business',
4 : 'Personal Loan', 5 : 'Student Use', 6 : 'Auto', 7 : 'Other', 8 : 'Baby & Adoption',
9 : 'Boat', 10 : 'Cosmetic Procedure', 11 : 'Engagement Ring', 12 : 'Green Loans',
13 : 'Household Expenses', 14 : 'Large Purchases', 15 : 'Medical or Dental', 16 : 'Motorcycle',
17 : 'RV', 18 : 'Taxes', 19 : 'Vacation', 20 : 'Wedding Loans'}
# mapping the dictionary contents to the column
loan['ListingCategory'] = loan['ListingCategory (numeric)'].map(listing_categories)
# dropping the numeric column as it is no longer needed for this analysis
loan.drop(columns=['ListingCategory (numeric)'], inplace=True)
# testing
loan['ListingCategory'].value_counts()
Debt Consolidation 58308 Not Available 16965 Other 10494 Home Improvement 7433 Business 7189 Auto 2572 Personal Loan 2395 Household Expenses 1996 Medical or Dental 1522 Taxes 885 Large Purchases 876 Wedding Loans 771 Vacation 768 Student Use 756 Motorcycle 304 Engagement Ring 217 Baby & Adoption 199 Cosmetic Procedure 91 Boat 85 Green Loans 59 RV 52 Name: ListingCategory, dtype: int64
# convert IncomeRange to ordered categorical types
ordered_income = {'IncomeRange': ['$0', '$1-24,999', '$25,000-49,999',
'$50,000-74,999', '$75,000-99,999', '$100,000+']}
# assign the correct order to the
for key,value in ordered_income.items():
ordered = pd.api.types.CategoricalDtype(categories=value,
ordered=True)
loan[key] = loan[key].astype(ordered)
# filtering for all dates following July 2009
loan = loan.query("(ListingCreationDate.dt.year > 2009) | ((ListingCreationDate.dt.year == 2009) & (ListingCreationDate.dt.month >= 7))")
# checking the earliest and latest date in the dataframe
loan['ListingCreationDate'].sort_values()
39013 2009-07-13 18:01:24.347
18374 2009-07-13 18:04:40.220
48139 2009-07-13 19:08:45.763
47705 2009-07-13 19:09:37.827
8072 2009-07-13 19:19:50.213
...
104205 2014-03-10 06:47:15.370
73267 2014-03-10 07:33:30.033
67741 2014-03-10 10:25:51.877
32376 2014-03-10 11:28:06.920
14131 2014-03-10 12:20:53.760
Name: ListingCreationDate, Length: 84853, dtype: datetime64[ns]
# count of data for each month
loan['ListingCreationDate'].dt.month.value_counts()
1 9201 10 8728 12 8441 11 8245 2 7963 9 7625 7 6638 8 6443 6 5808 5 5470 3 5385 4 4906 Name: ListingCreationDate, dtype: int64
#checking number of NaN and number of $0 values
loan['IncomeRange'].isna().sum(), loan['IncomeRange'].value_counts()
(649, $50,000-74,999 25627 $25,000-49,999 24175 $100,000+ 15205 $75,000-99,999 14498 $1-24,999 4654 $0 45 Name: IncomeRange, dtype: int64)
#checking number of Not Employed
loan['EmploymentStatus'].value_counts()
Employed 67310 Full-time 7927 Self-employed 4538 Other 3806 Not employed 649 Retired 367 Part-time 256 Not available 0 Name: EmploymentStatus, dtype: int64
# reassigning values in the Income range column where unemployed
loan['IncomeRange'] = np.where(loan['EmploymentStatus'] == 'Not employed', '$0', loan['IncomeRange'])
#testing for NaN and $0 values in IncomeRange
loan['IncomeRange'].isna().sum(), loan['IncomeRange'].value_counts()
(0, $50,000-74,999 25627 $25,000-49,999 24175 $100,000+ 15205 $75,000-99,999 14498 $1-24,999 4654 $0 694 Name: IncomeRange, dtype: int64)
In this section, investigate distributions of individual variables. If you see unusual points or outliers, take a deeper look to clean things up and prepare yourself to look at relationships between variables.
Rubric Tip: The project (Parts I alone) should have at least 15 visualizations distributed over univariate, bivariate, and multivariate plots to explore many relationships in the data set. Use reasoning to justify the flow of the exploration.
Rubric Tip: Use the "Question-Visualization-Observations" framework throughout the exploration. This framework involves asking a question from the data, creating a visualization to find answers, and then recording observations after each visualisation.
loan.describe()
| LenderYield | EstimatedEffectiveYield | EstimatedLoss | EstimatedReturn | DebtToIncomeRatio | LoanOriginalAmount | TradesNeverDelinquent (percentage) | CurrentCreditLines | |
|---|---|---|---|---|---|---|---|---|
| count | 84853.000000 | 84853.000000 | 84853.000000 | 84853.000000 | 77557.000000 | 84853.000000 | 84853.000000 | 84853.000000 |
| mean | 0.186017 | 0.168661 | 0.080306 | 0.096068 | 0.258752 | 9083.440515 | 0.905914 | 10.508032 |
| std | 0.074631 | 0.068467 | 0.046764 | 0.030403 | 0.318326 | 6287.860058 | 0.121353 | 5.317401 |
| min | 0.030000 | -0.182700 | 0.004900 | -0.182700 | 0.000000 | 1000.000000 | 0.080000 | 0.000000 |
| 25% | 0.125900 | 0.115670 | 0.042400 | 0.074080 | 0.150000 | 4000.000000 | 0.850000 | 7.000000 |
| 50% | 0.177500 | 0.161500 | 0.072400 | 0.091700 | 0.220000 | 7500.000000 | 0.950000 | 10.000000 |
| 75% | 0.247400 | 0.224300 | 0.112000 | 0.116600 | 0.320000 | 13500.000000 | 1.000000 | 13.000000 |
| max | 0.340000 | 0.319900 | 0.366000 | 0.283700 | 10.010000 | 35000.000000 | 1.000000 | 59.000000 |
binsize = 1000
bins = np.arange(0, loan['LoanOriginalAmount'].max()+binsize, binsize, )
# creating 3 subplots
fig, axes = plt.subplots(nrows=3,figsize=[12, 14], dpi=250)
# main title:
fig.suptitle("Loan Amount Distribution", fontsize=18)
# the plots:
sb.histplot(data=loan, x='LoanOriginalAmount', ax = axes[0], bins=bins, log=True);
sb.boxplot(data=loan, x='LoanOriginalAmount', ax=axes[1]);
sb.ecdfplot(data=loan, x='LoanOriginalAmount', ax=axes[2]);
# histogram customization:
mean_price = np.round(loan["LoanOriginalAmount"].mean(), 2)
axes[0].axvline(mean_price, color="orange", linestyle="-.", linewidth=1.5)
axes[0].text(
x=mean_price + 100,
y=10500,
s=f"mean={mean_price}$",
style="italic",
color='orange'
)
# ecdf customization
axes[2].axhline(0.55, color='black')
axes[2].axvline(9000, color='orange')
axes[2].text(
x=9000 + 100,
y=0.95,
s=f"9000$",
style="italic",
color='orange'
)
# log scale histogram customization
axes[0].set_yticks(np.logspace(1, 4, 4), ['10', '100', '1000', '10000'])
# titles and labels
axes[0].set_title('Loan Amount histogram plot')
axes[1].set_title('Loan Amount boxplot')
axes[2].set_title('Loan Amount Cumulative Distribution plot')
axes[0].set_xlabel("Loan Amount ($)");
axes[1].set_xlabel("Loan Amount ($)")
axes[2].set_xlabel("Loan Amount ($)")
plt.tight_layout()
The histogram and box-plot for loan amount is right skewed. The mean loan amount is around $9 000. It would be interesting to investigate why the majority of the loans are smaller and in which categories these loans exist.
The cumulative distribution plot shows us that 55% of loans are below 9000 USD.
# Observing the count of loan category
default_color = sb.color_palette()[0]
ax = sb.countplot(x=loan['ListingCategory'],order=loan['ListingCategory'].value_counts(ascending=False).index, color = default_color);
rel_values = loan['Term'].value_counts(ascending=False, normalize=True).values * 100
ax.bar_label(container=ax.containers[0], labels=rel_values.round(2), rotation=90)
plt.xticks(rotation=90);
Surprisingly the majority of loans are taken out to pay off existing loans, accounting for almost 70% of all loans in the dataset. Unfortunately no category descriptions are available for the two runner up categories i.e. not available and other. The fourth and fifth biggest categories are home improvement and business loans respectively. These two categories are less surprising and one would possibly expect them to be bigger.
plt.figure(figsize=(8, 8))
binsize = 0.025
bins = np.arange(0, loan['LenderYield'].max()+binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=loan, x='LenderYield', bins=bins)
plt.xlabel('LenderYield (%)')
plt.show()
<Figure size 800x800 with 0 Axes>
fig, axes = plt.subplots(1, 3, figsize=(12, 4), dpi=300)
fig.suptitle("Estimations on Yields, Losses and Returns", fontsize=16)
fig.supylabel('Count')
for idx,col in enumerate(['EstimatedEffectiveYield','EstimatedLoss','EstimatedReturn']):
sb.histplot(
data=loan,
x=col,
ax=axes[idx],
bins=20
)
# add numbers above the bars:
axes[idx].bar_label(axes[idx].containers[0])
# lets remove y axeis and y ticks:
axes[idx].set_yticks([])
axes[idx].set(yticklabels=[])
# we will change the labels:
axes[idx].set_xlabel("")
axes[idx].set_ylabel("")
axes[idx].set_title(f"{col} histogram");
axes[idx].tick_params(axis="x", labelrotation=30)
plt.tight_layout();
The estimation histograms show a promising result with yields and returns being left skewed, indicating higher yields and returns. And the estimated losses being right skewed, indicating that losses on loans are generally expected to be relatively smaller on a loss scale.
binsize = 0.125
bins = np.arange(0, loan['DebtToIncomeRatio'].max()+binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=loan, x='DebtToIncomeRatio', bins=bins)
# removing outliers
plt.xlim(0, 2)
# plot labels
plt.xlabel('Debt To Income Ratio');
The debt to income ratio is right skewed which means that the debt to income ratio for the majority of borrowers is in an appropriate range or lower. A general rule of thumb is to keep your debt to income ratio at or below 43%.
# Observing the Prosper Scores, Income Ranger and Employment Status
fig, ax = plt.subplots(nrows=4, figsize = [8,8],dpi=250)
default_color = sb.color_palette()[0]
sb.countplot(data = loan, x = 'ProsperScore', color = default_color, ax = ax[0])
sb.countplot(data = loan, x = 'IncomeRange', color = default_color, ax = ax[1])
sb.countplot(data = loan, x = 'EmploymentStatus', color = default_color, ax = ax[2])
sb.countplot(data = loan, x = 'LoanStatus', color = default_color, ax = ax[3])
plt.xticks(rotation=45)
plt.tight_layout();
/Users/carafinlay/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: Index.ravel returning ndarray is deprecated; in a future version this will return a view on self. grouped_vals = vals.groupby(grouper)
The prosper scores are mostly in a mid range between 4 and 8.
The majority of borrowers earn in the income ranges of 25,000-49,999 and 50,000-74,999 USD income range. Since it is difficult to obtain a low with a very loan income, this income range is not surprising.
The bulk of borrowers are employed at least partially, if not full-time. It would appear that obtaining a loan as self-employed would be surprisingly difficult. This graph could also indicate that working part-time or being retired would make loan approval significantly more challenging. Unsurprisingly, no employment equates to no loans.
The majority of the loans in this dataset is either current or completed, another promising indicator of lower risk to lenders.
binsize = 0.05
bins = np.arange(0, loan['TradesNeverDelinquent (percentage)'].max()+binsize, binsize)
plt.figure(figsize=[8, 5])
plt.hist(data=loan, x='TradesNeverDelinquent (percentage)', bins=bins)
plt.xlabel('TradesNeverDelinquent(%)');
The left skewed histogram shows that the majority of borrowers don't have any delinquent trade history. This shows promise to lenders that loans will be repaid in time and is perhaps expected assuming borrowers can't easily re-obtain loans once trades are delinquent.
default_color = sb.color_palette()[0]
ax = sb.countplot(x=loan['Term'],order=loan['Term'].value_counts(ascending=False).index, color = default_color);
rel_values = loan['Term'].value_counts(ascending=False, normalize=True).values * 100
ax.bar_label(container=ax.containers[0], labels=rel_values.round(2), rotation=90)
plt.xticks(rotation=90)
plt.xlabel('Term (months)');
Another surprising finding can be observed here. Almost 70% of loans are mid term i.e. 3 years. Again, I would be interested to find out which loan category these mid term loans belong to. If they predominantly are for debt consolidation it may indicate that borrowers have accumulated debt and although are paying it off, they are chasing more debt in pursuit of avoiding payment delinquencies.
fig, axes = plt.subplots(1,3,figsize=[12, 4], dpi=250)
fig.suptitle("Current Credit Lines Distribution", fontsize=16)
sb.histplot(data=loan, x='CurrentCreditLines', ax = axes[0]);
sb.boxplot(data=loan, x='CurrentCreditLines', ax=axes[1]);
sb.ecdfplot(data=loan, x='CurrentCreditLines', ax=axes[2]);
axes[2].set_title('Current Credit Lines Distribution plot')
axes[0].set_title('Current Credit Lines histogram plot')
axes[1].set_title('Current Credit Lines boxplot')
# histogram customization:
#mean_count = np.round(loan["CurrentCreditLines"].mean(), 2)
#axes[0].axvline(mean_count, color="orange", linestyle="-.", linewidth=1.5)
#axes[0].text(
# x=mean_price,
# y=8500,
# s=f"mean={mean_price}",
# style="italic",
# color='orange'
#)
plt.tight_layout();
The previous finding in regards to loan term, led to the observation of current credit lines among borrowers. Borrowers have an average of 10.32 current credit lines. Although this was expected, the extent of current loans is shocking. Only around 10% of borrowers have 5 current credit lines and around 10% have 20 or more.
Surprisingly almost 70% of loans are for debt consolidation. On average borrowers have 10 current credit lines and many have even more. Although delinquencies are low, given the current findings it appears that borrowers are in a debt cycle, which may be accumulative. Further investigation to determine accumulation would be interesting for analysis.
In section 1.4.0.5 Observing the Debt to Income Ratio outliers were removed so that only data between 0 and 2 is plotted.
Appropriate bin sizes were chosen for histograms.
In this section, investigate relationships between pairs of variables in your data. Make sure the variables that you cover here have been introduced in some fashion in the previous section (univariate exploration).
loan.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 84853 entries, 1 to 113936 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ListingCreationDate 84853 non-null datetime64[ns] 1 ProsperScore 84853 non-null category 2 Term 84853 non-null category 3 LoanStatus 84853 non-null category 4 LenderYield 84853 non-null float64 5 EstimatedEffectiveYield 84853 non-null float64 6 EstimatedLoss 84853 non-null float64 7 EstimatedReturn 84853 non-null float64 8 EmploymentStatus 84853 non-null category 9 IncomeRange 84853 non-null object 10 DebtToIncomeRatio 77557 non-null float64 11 LoanOriginalAmount 84853 non-null int64 12 TradesNeverDelinquent (percentage) 84853 non-null float64 13 CurrentCreditLines 84853 non-null float64 14 ListingCategory 84853 non-null object dtypes: category(4), datetime64[ns](1), float64(7), int64(1), object(2) memory usage: 8.1+ MB
categoric_vars = ['IncomeRange', 'EmploymentStatus','Term','ProsperScore']
fig, axes = plt.subplots(4, 2, figsize = [16,16], dpi=800)
fig.suptitle("Income Range, Employment Status Vs Loan Amount", fontsize=16)
for idx, col in enumerate(categoric_vars):
sb.barplot(data = loan, x=col, y= 'LoanOriginalAmount', ax=axes[idx][0], palette='crest')
sb.boxplot(data = loan, y='LoanOriginalAmount', x = col, ax=axes[idx][1], palette='crest')
axes[idx][1].set_ylabel("")
axes[idx][0].tick_params(axis="x", labelrotation=30)
axes[idx][1].tick_params(axis="x", labelrotation=30)
plt.tight_layout();
/Users/carafinlay/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: Index.ravel returning ndarray is deprecated; in a future version this will return a view on self. grouped_vals = vals.groupby(grouper) /Users/carafinlay/opt/anaconda3/lib/python3.9/site-packages/seaborn/categorical.py:641: FutureWarning: Index.ravel returning ndarray is deprecated; in a future version this will return a view on self. grouped_vals = vals.groupby(grouper)
Income range vs Loan Amount: In a previous count graph we observed very few borrowers in the 0 USD income range, therefore it is surprising to see that the mean loan amount for this category is around 7000 USD. The rest of the plot is as expected, i.e the lower the income range the lower the loan amount. The right skewed box plot for 0USD income category confirms that there are some high value loans for individuals without any income.
Employment status vs Loan Amount: We also saw in a previous plot that there were few borrowers in the self-employed category. Here we see that despite the low number of borrowers in this category, the mean loan amount in very high at 7 000 USD compared to the employed category with the highest mean loan amount of almost 10 000 USD. This graph is surprising in general as the unemployed category has a much higher mean than expected. The right skewed box plot for unemployed category confirms that there are some high value loans for individuals without any employment. This coincides with the income range observations.
Term vs Loan Amount: Observations as as expected. The lower the loan amount the lower the term.
Prosper Score vs Loan Amount: Observations as as expected. The higher the prosper score the higher the loan amount.
# plot matrix: sample 800
numeric_vars = ['LenderYield', 'EstimatedEffectiveYield', 'EstimatedLoss','EstimatedReturn','LoanOriginalAmount','CurrentCreditLines']
print("loan.shape=",loan.shape)
loans_samp = loan.sample(n=800, replace=False)
print("loan_samp.shape=",loans_samp.shape)
g = sb.PairGrid(data=loans_samp, vars=numeric_vars, dropna=True, diag_sharey=False)
g.map_diag(sb.kdeplot)
g.map_upper(sb.scatterplot)
g.map_lower(sb.kdeplot);
loan.shape= (84853, 15) loan_samp.shape= (800, 15)
Positive and highly correlated observations:
g = sb.regplot(data=loan, x='EstimatedLoss', y='LenderYield',
scatter_kws={'alpha': 0.3}, line_kws={'color':'orange'});
g.set(title='Lender Yield vs Estimated Loss',
xlabel = 'Estimated Loss',
ylabel = 'Lender Yield')
plt.savefig("Lender Yield vs Estimated Loss.png", dpi=300)
g = sb.regplot(data=loan.sample(3000, random_state=1), x='LoanOriginalAmount', y='LenderYield',
x_jitter=200, scatter_kws={'alpha': 0.3}, line_kws={'color':'orange'});
g.set(title='Lender Yield vs Loan Amount',
xlabel = 'Loan Amount ($)',
ylabel = 'Lender Yield')
plt.savefig("Lender Yield vs Loan Amount.png", dpi=300)
base_color = sb.color_palette()[0]
sb.barplot(data=loan, x='ListingCategory', y='LoanOriginalAmount', color=base_color, order=loan.groupby('ListingCategory')['LoanOriginalAmount'].mean().sort_values(ascending=False).index)
plt.xticks(rotation=90);
plt.ylabel('Avg. Original Loan Amount');
sb.regplot(data=loan[loan['LoanStatus']=='Current'].sample(5000, random_state=1), x='LoanOriginalAmount', y='CurrentCreditLines',
x_jitter=200, scatter_kws={'alpha': 0.3});
g = sb.relplot(data=loan, x='EstimatedLoss', y='LenderYield',
hue='EmploymentStatus', height=4, aspect=1.5)
g.set(title='Lender Yield vs Estimated Loss based on Employment Status',
xlabel='Estimated Loss',
ylabel='Lender Yield')
g.savefig("Lender Yield vs Estimated Loss based on Employment Status.png", dpi=300)
g = sb.relplot(data=loan, x='EstimatedLoss', y='LenderYield', hue='ProsperScore', height=4, aspect=1.5);
g.set(title='Lender Yield vs Estimated Loss based on Prosper Score',
xlabel='Estimated Loss',
ylabel='Lender Yield')
g.savefig("Lender Yield vs Estimated Loss based on Prosper Score.png", dpi=300)
g = sb.relplot(data=loan, x='EstimatedLoss', y='LenderYield', hue='Term', height=4, aspect=1.5);
g.set(title='Lender Yield vs Estimated Loss based on Loan Term(Months)',
xlabel='Estimated Loss',
ylabel='Lender Yield')
g.savefig("Lender Yield vs Estimated Loss based on Loan Term.png", dpi=300)
g = sb.relplot(data=loan.sample(500, random_state=1), x='LoanOriginalAmount', y='LenderYield',
size='ProsperScore', hue='ProsperScore', alpha=0.8, height=4, aspect=1.5)
g.set(title='Lender Yield vs Loan Amount based on Prosper Score',
xlabel='Loan Amount',
ylabel='Lender Yield')
g.savefig("Lender Yield vs Loan Amount based on Prosper Score.png", dpi=300)
# filtering the loan dataset
loana = loan[(loan['LoanOriginalAmount'] <= 5000) & (loan['CurrentCreditLines'] <= 10)]
g = sb.relplot(data=loana.sample(1000, random_state=1), x='EstimatedLoss', y='LenderYield',
size='ProsperScore', hue='ProsperScore', alpha=0.8, height=4, aspect=1.5);
g.set(title='Lender Yield vs Estimated Loss',
xlabel='Estimated Loss',
ylabel='Lender Yield')
g.savefig("ender Yield vs Estimated Loss for Loans less than 5000USD and borrower has sub 10 Credit Lines.png", dpi=300)
plt.figure(figsize=(8, 5));
g = sb.relplot(data=loan, x='EstimatedLoss', y='LenderYield',
hue='Term',kind='line', alpha=0.8, height=4, aspect=1.5);
g.set(title='Lender Yield vs Estimated Loss based on Term (months)',
xlabel='Estimated Loss',
ylabel='Lender Yield')
g.savefig("Lender Yield vs Estimated Loss based on Term", dpi=300)
<Figure size 800x500 with 0 Axes>